In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
# NOTE(review): autoreload failed to patch bsky_topics.db.schema in place (see the
# traceback in the output of the last executed cell); restart the kernel after
# editing the SQLAlchemy schema module instead of relying on autoreload.
%load_ext autoreload
%autoreload 2
In [2]:
import os
from datetime import datetime, timedelta
from pathlib import Path
import numpy
import pandas
from sqlalchemy import select, func
from bsky_topics.config import Config
from bsky_topics.db import configure_db, async_session
from bsky_topics.db.schema import Post, PostEmbedding
from bsky_topics.topics import compute_topics, get_indexed_posts_for_date_range
In [3]:
# Path of the TOML settings file handed to Config.load() in the next cell.
# It is relative, so it presumably resolves against the notebook's working directory.
CONFIG_FILE = "../env.toml"
In [4]:
# Read the project settings from CONFIG_FILE.
config = Config.load(CONFIG_FILE)
# Set up the database layer with the configured URL. This call is the cell's last
# expression, so its return value (the AsyncEngine shown in Out[4]) is displayed.
configure_db(config.db_url)
Out[4]:
<sqlalchemy.ext.asyncio.engine.AsyncEngine at 0x16a5efc90>
In [5]:
# NOTE(review): this import sits apart from the main import cell at the top of the notebook.
from bertopic import BERTopic
# Load a previously saved BERTopic model. The directory name appears to encode the
# start of the time block (2024-12-01 18:00:00), which matches `curr_date` in the
# query cells below; keep the two in sync when switching to another block.
loaded_model = BERTopic.load("saved_models/2024-12-01 180000")
In [6]:
async with async_session() as session:
curr_date = datetime(year=2024, month=12, day=1, hour=18)
block_end = curr_date + timedelta(hours=3)
stmt = (select(Post.post_text)
.join(PostEmbedding)
.filter(Post.indexed_at >= curr_date, Post.indexed_at < block_end))
post_texts = []
for post_text in await session.execute(stmt):
post_texts.append(post_text[0])
hierarchical_topics = loaded_model.hierarchical_topics(post_texts)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4505/4505 [00:33<00:00, 133.25it/s]
In [7]:
# Plot the topic hierarchy computed in the previous cell, passing top_n_topics=50.
# The call is the cell's last expression, so the returned figure is what gets displayed.
loaded_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, top_n_topics=50)
In [8]:
# Ids of the topics whose size in `topic_sizes_` is at least 50. Negative ids
# are skipped (BERTopic uses -1 for the outlier topic).
topics_with_at_least50 = {
    topic_id
    for topic_id, size in loaded_model.topic_sizes_.items()
    if topic_id >= 0 and size >= 50
}
In [9]:
async with async_session() as session:
curr_date = datetime(year=2024, month=12, day=1, hour=18)
block_end = curr_date + timedelta(hours=3)
stmt = (select(Post.post_text, Post.indexed_at)
.join(PostEmbedding)
.filter(Post.indexed_at >= curr_date, Post.indexed_at < block_end))
post_texts = []
timestamps = []
topics = []
for i, (post_text, timestamp) in enumerate(await session.execute(stmt)):
if loaded_model.topics_[i] in topics_with_at_least50:
post_texts.append(post_text)
timestamps.append(timestamp)
topics.append(loaded_model.topics_[i])
In [10]:
# Sanity check: number of posts kept by the topics_with_at_least50 filter
# (`topics` holds one entry per kept post).
print(len(topics))
187760
In [11]:
# Switch on the model's verbose flag for the call below. This is presumably what
# produces the progress counter seen in the cell output. The flag stays set on
# `loaded_model` for later cells.
loaded_model.verbose = True

# Topic representation over time for the filtered posts: timestamps are grouped
# into nr_bins=20 bins, with both global_tuning and evolution_tuning turned off.
topics_over_time = loaded_model.topics_over_time(
    post_texts,
    timestamps,
    topics=topics,
    nr_bins=20,
    global_tuning=False,
    evolution_tuning=False,
)
18it [00:05, 3.30it/s]
In [13]:
# Plot the per-bin topic data computed by topics_over_time(), passing top_n_topics=50.
# The call is the cell's last expression, so the returned figure is what gets displayed.
loaded_model.visualize_topics_over_time(topics_over_time, top_n_topics=50)
In [14]:
# Plot the model's topics, passing top_n_topics=50.
# The call is the cell's last expression, so the returned figure is what gets displayed.
loaded_model.visualize_topics(top_n_topics=50)
[autoreload of bsky_topics.db.schema failed: Traceback (most recent call last):
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 276, in check
superreload(m, reload, self.old_objects)
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
update_generic(old_obj, new_obj)
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
update(a, b)
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 335, in update_class
if (old_obj == new_obj) is True:
^^^^^^^^^^^^^^^^^^
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/sql/operators.py", line 582, in __eq__
return self.operate(eq, other)
^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/orm/attributes.py", line 453, in operate
return op(self.comparator, *other, **kwargs) # type: ignore[no-any-return] # noqa: E501
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/orm/relationships.py", line 762, in __eq__
self.property._optimized_compare(
File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/orm/relationships.py", line 1185, in _optimized_compare
raise sa_exc.ArgumentError(
sqlalchemy.exc.ArgumentError: Mapped instance expected for relationship comparison to object. Classes, queries and other SQL elements are not accepted in this context; for comparison with a subquery, use PostEmbedding.post.has(**criteria).
]
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [ ]: